
## this is the core analytical file for underproduction analysis (diss project 1 + 2 GLUE for EMSE)
## run locally

## Step 0: load libraries, set variables, make some helper functions
library(dplyr)
library(reldist)
library(ggplot2)
library(scales)
library(stringr)
library(tidyr)
library(eulerr)
library(effects)
library(ggeffects)
library(sjPlot)
library(texreg)

setwd("~/Research/kaylea_dissertation/eseJournal")
## yearsOld is package 
load('processed_data/fullDataset.RData') 
figDir = '~/Dropbox/Apps/Overleaf/EMSE\ SANER\ Call\ --\ Underproduction/figures/'
pubDataDir = '~/Dropbox/Apps/Overleaf/EMSE\ SANER\ Call\ --\ Underproduction/knitr_rdata/'
source('lib-00-utils.R') #loads up helpers for making Rtex docs etc

reverselog_trans <- function(base = exp(1)) {
  trans <- function(x) -log(x, base)
  inv <- function(x) base^(-x)
  trans_new(paste0("reverselog-", format(base)), trans, inv, 
            log_breaks(base = base), 
            domain = c(1e-100, Inf))
}

generateMeanLangAges <- function(df) {
  langList <- colnames(langDF)[-1]
  df$meanLangAge <- NA
  for (i in seq(length(df$package))) {
#    print(i)
    my.totLangs <- 0
    my.langAgeSum <- 0
    for (lang in langList) {
#      print('now examining')
#      print(lang)
#      print('for package')
#      print(df[i,]$package)
      if (is.na(df[i,lang])) {
#        print('skipping NA')
        next #skip all the NAs
      }
      if (df[i,lang] ==0) {
#        print('skipping 0')
        next #skip all the tagged but not using language X 
      }
#      print(df[i,lang])
      my.totLangs = my.totLangs + 1     
      my.langAgeSum <- my.langAgeSum + (df[i,lang] * as.numeric(subset(langAgeDF, langAgeDF$language==lang)$deltaNow))
    }
    if (my.totLangs != 0) { #otherwise leave as NA
      df[i,'meanLangAge'] <- my.langAgeSum / my.totLangs
    }
  }
  return(df)
}

## Step 1: Some cleanup and type fixes. Calculate key variable prop_team
# 
newRow <- langAgeDF[langAgeDF$language == "c++",] ## R doesn't like ++ in colnames so it munges this into c.. in some cases
newRow$language <- "c.."
langAgeDF <- rbind(langAgeDF, newRow)
newRow <- langAgeDF[langAgeDF$language == "c-sharp",] ## R doesn't like - in colnames so it munges this into c.. in some cases
newRow$language <- "c.sharp"
langAgeDF <- rbind(langAgeDF, newRow)

packageDF <- packageDF %>% mutate(maintainerTurnover = case_when(maintainerCount > 1 ~ TRUE,
                                                                 TRUE ~ FALSE))
packageDF <- unique(packageDF)
packageDF$maintainerTurnover <- as.factor(packageDF$maintainerTurnover)
packageDF$yearsOld <- as.numeric(packageDF$yearsOld)

packageDF$prop_team <- packageDF$list_maint / packageDF$uploadCount
summary(packageDF$prop_team)
hist(packageDF$prop_team) #note that it is rather bimodal

langAgeDF$deltaNow <- 2023 - as.numeric(langAgeDF$release.one)

#debugging this function with a test subset
#testDF <- subset(packageDF, packageDF$inst.rank < 100)
#temp <- generateMeanLangAges(testDF)
packageDF <- generateMeanLangAges(packageDF)

## Step 2: Run Models 


## M1: no language or network measures
m1 <- glm(underprod ~ yearsOld + uploaderCount + maintainerTurnover + prop_team, data=packageDF, family="binomial")
summary(m1)

## M2: No language measures
m2 <- glm(underprod ~ yearsOld + uploaderCount + maintainerTurnover + prop_team + Eig.comment + Betweenness.comment, data=packageDF, family="binomial")
summary(m2)

## M3: No network measures
m3 <- glm(underprod ~ yearsOld + meanLangAge + yearsOld*meanLangAge + uploaderCount + maintainerTurnover + prop_team, data=packageDF, family="binomial")
summary(m3)

## M4: all measures

m4 <- glm(underprod ~ yearsOld + meanLangAge + yearsOld*meanLangAge + uploaderCount + maintainerTurnover + prop_team + Eig.comment + Betweenness.comment, data=packageDF, family="binomial")
summary(m4)

### will need these preds for the visuals later
preds <- ggpredict(m4, terms=c("yearsOld [0:30]", "meanLangAge [25, 48]"))
#preds <- data.frame(predict.glm(allFactors2, type="response", se.fit=TRUE))

## save out what the paper needs via remember

## number of aliases
## monster table is all four models, m1, m2, m3, and m4
## m4 coefs for H1, H2, H4, H5, H6, H7, H8 interp

con <- textConnection("monster.texreg", "w") #remembered
sink(con, split=TRUE, type="output")
texreg(list(m1,m2,m3,m4), stars=NULL, digits=2,
       custom.model.names=c('M1: no lang/network measures', 'M2: No language measures', 'M3: No network measures', 'M4: Full model'), 
       custom.coef.names=c('(Intercept)', 'Package Age (years)', 'Uploader Count', 'Did maintainer change?', 'Team proportion', 'Eigenvector Centrality', 'Betweenness Centrality', 'Mean Language Age', 'Package Age : Mean Language Age'), 
       use.packages=FALSE, table=FALSE, ci.force = TRUE)
sink()
close(con);rm(con)

m4.coefs <- summary(m4)$coefficients ## for easy reference within the latex document

### extended investigation of hhi and productive inequality

cor.test(as.numeric(packageDF$maintainerTurnover), packageDF$underprod)
cor.test(as.numeric(packageDF$hhi.manual), packageDF$underprod) #higher concentration of maintainership is associated with lower underproduction
hist(packageDF$hhi.manual)

g <- ggplot(packageDF, aes(hhi.manual)) +
  geom_histogram() +
  theme_bw() + 
  labs(x="Concentration (HHI)", y="Count")  +
  theme(legend.position = "bottom")
g


pdf(paste0(figDir, "HHI.pdf"))
g
dev.off()

max(packageDF$hhi.manual)
min(packageDF$hhi.manual)
mean(packageDF$hhi.manual)
median(packageDF$hhi.manual)


g <- ggplot(packageDF, aes(x=is_underprod, y=hhi.manual, fill=is_underprod)) +
  geom_boxplot() 

g

## dichotomize by whether there's any turnover at all
g <- ggplot(subset(packageDF, packageDF$maintainerTurnover==TRUE), aes(x=is_underprod, y=hhi.manual, fill=is_underprod)) +
  geom_boxplot() 

g


m5.limit <- glm(underprod ~ hhi.manual, data=packageDF, family="binomial")

summary(m5.limit)

con <- textConnection("m5.limit.texreg", "w") #remembered
sink(con, split=TRUE, type="output")
texreg(m5.limit, stars=NULL, digits=2,
       custom.coef.names=c('(Intercept)', 'HHI'), 
       custom.model.names = 'Underproduction and Productive Inequalities',
       use.packages=FALSE, table=FALSE, ci.force = TRUE)
sink()
close(con);rm(con)


m5 <- glm(underprod ~ yearsOld + meanLangAge + yearsOld*meanLangAge + uploaderCount +  maintainerTurnover +
            prop_team + Eig.comment + Betweenness.comment + hhi.manual, data=packageDF, family="binomial")

summary(m5)
m5.res <- residuals(m5)
hist(m5.res)
m4.res <- residuals(m4)
hist(m4.res)


con <- textConnection("m5.texreg", "w") #remembered
sink(con, split=TRUE, type="output")
texreg(m5, stars=NULL, digits=2,
       custom.coef.names=c('(Intercept)', 'Package Age (years)',  'Mean Language Age', 
                           'Uploader Count', 'Did maintainer change?', 'Team proportion', 'Eigenvector Centrality', 
                           'Betweenness Centrality', 'HHI', 'Package Age : Mean Language Age'), 
       use.packages=FALSE, table=FALSE, ci.force = TRUE)
sink()
close(con);rm(con)



g <- ggplot(packageDF, aes(x=is_underprod, y=numMaintainers)) +
  geom_boxplot() 

g



g <- ggplot(packageDF, aes(x=hhi.manual, y=up.fac.mean)) +
  geom_smooth() +
  geom_point(alpha=0.1)

g

g <- ggplot(subset(packageDF, packageDF$maintainerTurnover==TRUE), aes(x=hhi.manual, y=maintainerCount)) +
  geom_smooth() 

g



## colis validation

threshold <- 5 # if it has the bug, we'd expect it in pct 0-4
#packageDF.colis <- read.csv('by_inst_20240920.tsv', sep="\t")
packageDF.colis <- read.csv('bullseye_usage_20220215.tsv', sep="\t", header = FALSE)
colnames(packageDF.colis) <- c('package','inst')

### popcon contains a lot of packages that are no longer distributed or weren't part of bullseye, which is what colis looked at


colisDF <- data.frame(read.csv('colisFoundBugs.tsv', header=FALSE, sep="\t"))
colnames(colisDF) <- c('bug_id', 'package', 'severity')

## quality score:
## 0: no bug
## 0: wishlist
## 1: minor bug
## 5: normal bug
## 10: important bug
## 25: serious -- release critical
## 50: grave -- release critical
## 100: critical -- release-critical

colisDF <- colisDF %>% mutate(score = case_when(
                severity == 'wishlist' ~ 0, 
                severity == 'minor' ~ 1, 
                severity == 'normal' ~ 10, 
                severity == 'important' ~ 50, 
                severity == 'serious' ~ 250,  ## not in dataset fwiw
                severity == 'grave' ~ 500,  ## not in dataset fwiw
                severity == 'critical' ~ 1000, ## not in dataset fwiw
                TRUE ~ NA
            )) 

colisDF.collapse <- colisDF
colisDF.collapse$bug_id <- NULL
colisDF.collapse <- colisDF.collapse %>% group_by(package) %>% reframe(package=package,
                                                                quality=-sum(score)) ## note that "negative" quality is bad
dim(colisDF.collapse)
colisDF.collapse <- unique(colisDF.collapse) # a few dupes removed
dim(colisDF.collapse)
hist(colisDF.collapse$quality)
n.colis.pk <- length(unique(colisDF.collapse$package))
n.colis <- length(colisDF$package)

colisDF.full <- merge(colisDF, colisDF.collapse, by='package')

bullDF <- read.csv('bullseye_packages.txt', header=FALSE)
colnames(bullDF) <- c('package')

dim(packageDF.colis)
packageDF.colis <- subset(packageDF.colis, packageDF.colis$package %in% bullDF$package) #only things that were in bullseye at release count
dim(packageDF.colis) #should be unchanged
packageDF.colis <- unique(packageDF.colis)
dim(packageDF.colis) #no dupes
hist(packageDF.colis$inst)
hist(log(packageDF.colis$inst)) 

packageDF.colis <- merge(packageDF.colis, colisDF.full, by='package', all.x=TRUE)
dim(packageDF.colis)
# fill all the NAs with max value
minQual <- min(colisDF.full$quality)
maxQual <- -minQual
packageDF.colis$quality[is.na(packageDF.colis$quality)] <- maxQual  ## we have NAs and negative (bad) quality, nonbug is top quality

# look for missings -- have colis bug but not in popcon
miss <- subset(colisDF, !(colisDF$package %in% packageDF.colis$package))
miss ## each of these are explained in notes 


table(packageDF.colis$quality) #negative quality is bad; now we see most are at 60 (no bug)
### ranking 


packageDF.colis$rank.i <- rank(packageDF.colis$inst, ties.method = "max") #lots of installs gives high rank
packageDF.colis$rank.q <- rank(packageDF.colis$quality, ties.method="max") # positive quality gives high rank
max(packageDF.colis$rank.i)
max(packageDF.colis$rank.q)
packageDF.colis$upfactor <- log(packageDF.colis$rank.i / packageDF.colis$rank.q) ## positive is higher risk, negative is lower risk
hist(packageDF.colis$upfactor)

colisTab <- table(packageDF.colis$upfactor)
colisTab
upDF.colis <- subset(packageDF.colis, packageDF.colis$upfactor > 0)
upDF.colis

library(xtable)

upDF.colis.tableready <- upDF.colis
upDF.colis.tableready$rank.i <- NULL
upDF.colis.tableready$rank.q <- NULL
upDF.colis.tableready$inst <- NULL
upDF.colis.tableready$quality <- NULL
upDF.colis.tableready$bug_id <- NULL
upDF.colis.tableready$severity <- NULL
upDF.colis.tableready$score <- NULL
upDF.colis.tableready <- unique(upDF.colis.tableready)
upDF.colis.tableready <- arrange(upDF.colis.tableready, desc(upfactor), package)
row.names(upDF.colis.tableready) <- NULL ## leftover from old dataframe
colnames(upDF.colis.tableready) <- c('Package','COLIS Underproduction Factor')
upDF.colis.tableready <- upDF.colis.tableready[1:25,]
upDF.colis.tableready

upDF.tabified <- xtable(upDF.colis.tableready, digits=c(2))

con <- textConnection("upDF.colis.tex", "w") #remembered
sink(con, split=TRUE, type="output")
print(upDF.tabified,
      only.contents=TRUE,
      include.rownames=FALSE,
      comment = FALSE)
sink()
close(con);rm(con)


checkDF <- merge(packageDF, packageDF.colis, by='package')

table(checkDF$bug_id)
checkDF$bug_id[is.na(checkDF$bug_id)] <- checkDF$package[is.na(checkDF$bug_id)]   ## we have NAs that are supposed to be acting as unique IDs in the colis dataset 
table(checkDF$bug_id)

bugTab <- as.data.frame(table(checkDF$bug_id))
subset(bugTab, bugTab$Freq != 1) ## no duplicate bugs, so independent
cor.test(checkDF$up.fac.mean, checkDF$upfactor) ## p-value not useful if not independent

## 

## future: if we have any duplicate bug ids in the checkDF, must model as nested
#library(lme4)
#icc.model <- lmer(up.fac.mean ~ 1 + (1|bug_id) , data=checkDF)
#icc.model
#resM1 <- summary(icc.model)
#sigma2s <- data.frame(VarCorr(icc.model))[,"vcov"]
#sigma2alpha <- sigma2s[1]
#sigma2y <- sigma2s[2]
#muAlpha <- resM1$coefficients[1]
#icc <- sigma2alpha / (sigma2alpha + sigma2y)
#ricc <- round(icc, 4)
#ricc


#gap.model.1 <- lmer(upfactor ~ up.fac.mean + (1 | bug_id), data=checkDF)#todo - check this is what I want
#gap.model.1
#summary(gap.model.1)

#screenreg(gap.model.1, digits=4, ci.force = TRUE)

#con <- textConnection("predictFutureTex", "w") #how we save R objects to strings
#sink(con, split=TRUE, type="output")

#texreg(gap.model.1, stars=NULL,
#       custom.model.names = c('Past and Future Underproduction'),
#       use.packages=FALSE,
#       table=FALSE,
#       digits=4,
#       ci.force = TRUE)

#sink()
#close(con);rm(con)
#gap.model.2 <- lmer(up.fac.mean ~ upfactor + (1 + upfactor | bug_id), data=checkDF)#todo - check this is what I want
#gap.model.2 ## fails -- no fit b/c most packages have one entry
#summary(gap.model.2)





if (!nosave) {
  r <- list()
  remember(upDF.colis.tex)
  remember(m4.coefs)
  remember(monster.texreg)
  remember(m5.texreg)
  remember(m5.limit.texreg)
  remember(n.colis.pk)
  remember(n.colis)
  save(r, file=paste0(pubDataDir, "knitr_data.RData"), version=2)
  rm(r)
}

screenreg(m5)

#build the figures for the paper by running visuals.R <-- do locally
